/* a_gen_prelim_data.do - ******************************************************

	This do file move and cleans many of the raw data sources into the PreK directory.

*******************************************************************************/

set more off
clear

* Switches to run sections of the code
local move_sat  			1
local qgis						1
local geo 						1
local nsc							1
local prektype				1
local move_mcas 			1
local extract 				1
local bps_05_12 			1

********************************* SAT ******************************************

if `move_sat' == 1 {
	use "$raw_data\MA DESE data\AP, SAT, ACT\all_SAT_dta\sat_with_sasids_by_crosswalk_2007-2016.dta", clear

	save "$stata_data_sat/sat_full.dta", replace
}
********************** Merge QGIS output with census data **********************
if `qgis' == 1 {
	do "$code/0_Data_Prep/QGIS/create_census_2000_bg.do"
}

********************************* Clean Geo Data *******************************
if `geo' == 1{
	local year "1112 1213 1314"
	local grade = "K1"

	foreach y of local year {
		foreach g of local grade {
			if 	"`y'" == "1112" {
				local ylong = "SY2011-2012"
			}
			if 	"`y'" == "1213" {
				local ylong = "SY2012-2013"
			}
			if 	"`y'" == "1314" {
				local ylong = "SY2013-2014"
			}

			import excel using "$raw_data/BPS/BPS assignment data/`ylong'/`y'Round1/Student Assignment Report (Excel).xls", clear first

			keep if Grade=="`g'"

			tempfile t`y'
			save `t`y'', replace
			local last = "`y'"
		}
	}

	foreach y of local year {
		if "`y'"!="`last'" {
			append using `t`y'', force
		}
	}
	destring GeoCode, replace

	bys GeoCode : egen sumeast = sum(IsEastBoston=="X")
	bys GeoCode : egen sumnoneast = sum(IsEastBoston=="")
	duplicates drop GeoCode, force
	gen East = sumeast>0 & sumeast!=.
	keep GeoCode East
	rename GeoCode Geo
	sort Geo
	save "$stata_data_location/EastGeos.dta", replace

	* Import and save Geocode to Census Tract crosswalk
	insheet using "$raw_data_bps/Pre-K/geo_census_intersect.csv", clear
	destring geocode, replace
	keep geocode tractce10
	tempfile tracts
	save "$stata_data_location/geo_data.dta", replace

	* Import 2000 Census Block Group characteristics
	use "$stata_data_location/geo_census_intersect_2000.dta", clear
	save "$stata_data_location/student_to_loc.dta", replace

}
********************************* NSC Data **************************************
if `nsc' == 1 {
	*Clean pull results
	import delimited using "$raw_data_bps/NSC/2020.05 Request/500708_T205676.202006291304_DA.csv",clear
	compress
	save "$stata_data_nsc/preknscpull.dta", replace

	*Merge together to give some information back to pull results (e.g., dob).
	merge m:1 requesterreturnfield using "${raw_data_bps}/NSC/2020.05 Request/complete_request.dta"
	drop _merge
	save "$stata_data_nsc/full_results.dta", replace

}

********************************* Move MCAS Data ****************
if `move_mcas' == 1 {
forvalues y = 2002/2018 {
	display "*******************************************************************"
	display "					`y'												"
	display "*******************************************************************"



	if inrange(`y',2002,2015) {
		use "$raw_data_mcas/mcas`y'.dta", clear
	}
	else if inrange(`y',2016,2018) {
		use "$raw_data_mcas/raw/mcas`y'.dta", clear
		if `y'==2016 {
			rename erawsc_sept erawsc
			rename mrawsc_sept mrawsc
			rename srawsc_sept srawsc
		}
		cap rename schcode16a school
		cap gen school = substr(sprp_sch,5,8)
	}

	// in some years this variable is teststa not teststat
	if `y'==2014 {
		rename eteststa eteststat
		rename mteststa mteststat
		rename steststa steststat
	}

	local name firstname lastname
	if `y' == 2014 {
		local name lastname
	}

	keep `name' sasid grade ealt  eteststat eperflev escaleds erawsc malt  mteststat mperflev mscaleds salt school   steststat sperflev sscaleds mrawsc srawsc dob

	*These are identifiers for missing data
	drop if inlist(sasid,"411","","++")
	duplicates tag sasid, gen(dupid)
	tab dupid
	// Of the remaining instances, select the one with the most data available
	foreach x in  e m s {
		gen numfills`x' = !missing(`x'scaleds) + !missing(`x'perflev) + !missing(`x'rawsc)
		foreach v in alt  teststat perflev scaleds rawsc {
			sort sasid school numfills`x', stable
			by sasid school: replace `x'`v' = `x'`v'[_N]	// fill in with the observation with most data
		}
	}

	* only keep necessary variables
	keep sasid *rawsc grade *alt *teststat *scaleds school
	duplicates drop
	* duplicates drop sasid, force

	foreach x in e m s {
		bys `x'rawsc grade `x'alt `x'teststat (`x'scaleds) : replace `x'scaleds = `x'scaleds[1] if missing(`x'scaleds) & !missing(`x'rawsc,grade)
	}

	foreach x in e m s {
		bys `x'scaleds grade `x'alt `x'teststat (`x'rawsc) : replace `x'rawsc = `x'rawsc[1] if missing(`x'rawsc) & !missing(`x'scaleds,grade)
	}

	foreach z in rawsc scaleds {
		foreach x in e m s {
			bys grade `x'alt  : egen m`x'`z' = mean(`x'`z')
			bys grade `x'alt  : egen sd`x'`z' = sd(`x'`z')
			gen `x'std`z' = (`x'`z' - m`x'`z')/sd`x'`z'
		}
	}

	keep sasid grade *stdrawsc *stdscaleds *teststat school // `name' dob school

	gen year = `y'

	tempfile f`y'
	save "$stata_data_mcas/`y'/mcas_sub.dta", replace

}

clear
forvalues y = 2002/2018 {
	append using "$stata_data_mcas/`y'/mcas_sub.dta"
}


compress
save "$stata_data_mcas/mcas_full.dta", replace
}

****************** Determine School Type- EECs or ESPs *************************
if `prektype' == 1 {
	insheet using "$raw_data_bps/Enrollment/Enrollment98to04/EconomistStudentExtract.txt", clear

	*Collapse on school and grade and count unique observation of studentno

	collapse (count) enrollment = studentno, by(sch grade)

	keep if inlist(grade, "K0", "K2", "K1", "01", "02", "03")
	gen grade_num = 0 if grade == "K0"
	replace grade_num = 1 if grade == "K1"
	replace grade_num = 2 if grade == "K2"
	replace grade_num = 3 if grade == "01"
	replace grade_num = 4 if grade == "02"
	replace grade_num = 5 if grade == "03"
	drop grade
	tsset sch grade_num
	tsfill

	reshape wide enrollment, i(sch) j(grade_num)

	*Gen coding for different scenarios of grade presence
	gen gr_01_K2 = enrollment3 > enrollment2 if !missing(enrollment2)
	gen gr_01_K1 = enrollment3 > enrollment1 if !missing(enrollment1)
	gen gr_K2_K1 = enrollment2 > enrollment1 if !missing(enrollment1)
	gen gr_K1_K0 = enrollment1 > enrollment0 if !missing(enrollment0)

	gen has_K1 = enrollment1 != .
	gen has_K0 = enrollment0 != .

	gen ELC = enrollment5 == . & (enrollment4 != . | enrollment3 != . | enrollment2 != . | enrollment1 != . | enrollment0 != .)
	gen nLC = ELC == 0
	egen total_enroll = rowtotal(enrollment0-enrollment3)

	tostring sch, replace
	preserve
	keep if total_enroll > 14
	keep sch has_K1 has_K0 ELC total_enroll

	save "$stata_data/school_coding_num.dta", replace
	restore

	*The list of ELCs was also confirmed using online search
	keep sch has_K1 has_K0 ELC nLC
	drop if sch == "4840" | sch == "4850"
	save "$stata_data/school_coding.dta", replace
}


********************************* Clean Economist Extract Data ****************
if `extract' == 1 {

	*Clean data for future usage
	insheet using "$raw_data_bps/Enrollment/Enrollment98to04/EconomistStudentExtract.txt", clear
	tostring sch, replace
	merge m:1 sch using "$stata_data/school_coding.dta", nogen keep(master match)
	gen year = asnyear - 60001
	sort studentno

	/************* Cleans the data to exactly match our nsc request ***************/
	// create bdate
	tostring dob, replace
	replace dob = "" if dob=="."
	replace dob = "0" + dob if length(dob)<6 & !missing(dob)
	gen mm = substr(dob,1,2)
	gen dd = substr(dob,3,2)
	gen yy = substr(dob,5,2)
	destring mm dd yy, replace
	gen yyyy = 1900 + yy if yy>60
	replace yyyy = 2000 + yy if yy<60
	gen bdate = mdy(mm,dd,yyyy)
	format bdate %td
	drop dob
	rename bdate dob


	*Clean names for better matching. Copies original NSC pull do file.
	replace firstname = upper(firstname)
	replace lastname = upper(lastname)

	split firstname, parse(" ")
	local nvars = `r(nvars)'
	forvalues j = 1/`nvars' {
		local nm1 = `j'-1
		if `nm1'>0 {
			gen mi`nm1' = ""
		}
		}

	gen jr = ""
	gen ii = ""
	gen iii = ""
	gen iv = ""

	forvalues i = 1/`nvars' {
		replace jr = "JR" if inlist(firstname`i',"JR","JUNIOR")
		replace ii = "II" if firstname`i'=="II"
		replace iii = "III" if firstname`i'=="III"
		replace iv = "IV" if firstname`i'=="IV"
		forvalues j = 1/`nvars' {
			if `j'<`nvars' {
				replace mi`j' = firstname`i' if length(firstname`i')==1 & mi`j'==""
				replace firstname`i'="" if length(firstname`i')==1 & firstname`i'==mi`j'
			}
		}
	}

	replace firstname2 = subinstr(firstname2,"JR","",.) if jr=="JR"
	replace firstname2 = subinstr(firstname2,"JUNIOR","",.) if jr=="JR"
	replace firstname2 = subinstr(firstname2,"II","",.) if ii=="II"
	replace firstname2 = subinstr(firstname2,"III","",.) if iii=="III"
	replace firstname2 = subinstr(firstname2,"IV","",.) if iv=="IV"

	gen fname = firstname1 + " " + firstname2
	replace fname = trim(fname)

	gen fi = firstname1 if length(firstname1)==1 & (length(firstname2)>1)

	split lastname, parse(" ")
	local nvars = `r(nvars)'
	forvalues i = 1/`nvars' {
		replace jr = "JR" if lastname`i'=="JR"
		replace ii = "II" if lastname`i'=="II"
		replace iii = "III" if lastname`i'=="III"
		replace iv = "IV" if lastname`i'=="IV"
	}
	replace lastname2 = "" if lastname2=="JR"
	replace lastname2 = "" if lastname2=="II"
	replace lastname2 = "" if lastname2=="III"
	replace lastname2 = "" if lastname2=="IV"


	gen lname = lastname1 + " " + lastname2
	replace lname = trim(lname)

	replace lname = subinstr(lname,"  ","",.)
	replace fname = subinstr(fname,"  ","",.)

	rename mi1 mi

	foreach x in jr ii iii iv {
		gen _`x' = !missing(`x')
		drop `x'
		rename _`x' `x'
	}

	gen sname = "JR" if jr==1
	replace sname = "II" if ii==1
	replace sname = "III" if iii==1
	replace sname = "IV" if iv==1

	sort studentno mi
	bys studentno (mi) : replace mi = mi[_N]

	duplicates tag studentno, gen(dupid)


	compress
	gen old_sch = sch
	gen not_grade_K1 = grade != "K1"
	bysort studentno (not_grade_K1 year): replace sch = sch[1]
	bysort studentno: egen no_K1 = min(not_grade_K1)
	replace sch = "" if no_K1==1

	drop bilingual geo
	save "$stata_data/cleaned_extract.dta", replace

	bysort studentno (year): keep if _n == 1
	keep fname lname mi dob studentno year
	save "$stata_data/primary_sample.dta", replace
}

******************* Processes 2005-2012 BPS Enrollment Data ********************
if `bps_05_12' == 1 {
	insheet using "$raw_data_bps/Enrollment/Enrollment05to13/BPSEnrollmentAugust2012.CSV", clear

	local tick = 0
	foreach var in year id	school	food	race	dob	geo	apt	streetno	street	zip	///
		sex	lepstatus	homelang1	homelang2	bil	sncode	homelang3	firstlang	withcode	///
		withdate	entcode	entdate	homeroom	grade {
		local ++tick
		rename v`tick' `var'
	}
	drop v26

	rename id StudentNo
	save "$stata_data/enroll05_12.dta", replace
}
